##### Replication of figure 2

rm(list = ls())
library(tidyverse)
library(igraph)
library(ggraph)
library(data.table)

# Read the input data (path is relative to the working directory). The
# columns PhDUni, PGUni and UGUni are the ones tabulated below.
cenl <- read_csv("nltrial_clean2.csv")

### Create figure 2 ###

# Number of rows per PhD university; rows without a PhD university are dropped.
PhdUnis <- cenl %>%
  count(PhDUni) %>%
  filter(!is.na(PhDUni))

# Universities that occur exactly once are pooled into a single "other" row
# whose n is the number of such universities.
PhdUnis2 <- tibble(PhDUni = "other", n = sum(PhdUnis$n == 1))

# Keep the universities occurring more than once, append the pooled row and
# sort by descending count.
PhdUnis <- PhdUnis %>%
  filter(n > 1) %>%
  rbind(PhdUnis2) %>%
  arrange(desc(n))

# Number of rows per postgraduate university; rows without one are dropped.
PGUnis <- cenl %>%
  count(PGUni) %>%
  filter(!is.na(PGUni))

# Universities that occur exactly once are pooled into a single "other" row
# whose n is the number of such universities.
PGUnis2 <- tibble(PGUni = "other", n = sum(PGUnis$n == 1))

# Keep the universities occurring more than once, append the pooled row and
# sort by descending count.
PGUnis <- PGUnis %>%
  filter(n > 1) %>%
  rbind(PGUnis2) %>%
  arrange(desc(n))


# Number of rows per undergraduate university; rows without one are dropped.
UGUnis <- cenl %>%
  count(UGUni) %>%
  filter(!is.na(UGUni))

# Universities that occur exactly once are pooled into a single "other" row
# whose n is the number of such universities.
UGUnis2 <- tibble(UGUni = "other", n = sum(UGUnis$n == 1))

# Keep the universities occurring more than once, append the pooled row and
# sort by descending count.
UGUnis <- UGUnis %>%
  filter(n > 1) %>%
  rbind(UGUnis2) %>%
  arrange(desc(n))


# Tag every table with its education stage. The stage is used as the x
# variable of the figure; the numeric prefix presumably fixes the order of the
# bars.
UGUnis <- UGUnis %>% mutate(level = "1 - UG")
PGUnis <- PGUnis %>% mutate(level = "2 - PG")
PhdUnis <- PhdUnis %>% mutate(level = "3 - PhD")

# Rank the postgraduate universities by how often they occur.
#
# The pooled "other" row (universities occurring only once) is identified by
# its label and receives the sentinel rank 0, which the code further down uses
# to recognise it. Every real university is ranked 1, 2, ... by descending n,
# with ties kept in their current row order. Previously the rank was derived
# from the row position (order() - 1), which silently assumed that "other" is
# always the largest row and mislabelled the most frequent university
# whenever it is not.
names(PGUnis)[1] <- "University"
pg_is_other <- PGUnis$University %in% "other"
PGUnis$freq <- 0
PGUnis$freq[!pg_is_other] <- rank(
  -PGUnis$n[!pg_is_other],
  ties.method = "first"
)

# Ranks up to ten (and, provisionally, the rank-0 "other" row) are flagged as
# top ten; everything else occurs at least twice but is not in the top ten.
# Alternative that would keep every university tied with rank ten:
#PGUnis$top[PGUnis$n >= PGUnis$n[PGUnis$freq==10]] <- "1 - top ten"
PGUnis$top <- ifelse(PGUnis$freq <= 10, "1 - top ten", "98 - at least twice")

# Rank the PhD universities by how often they occur.
#
# The pooled "other" row (universities occurring only once) is identified by
# its label and receives the sentinel rank 0, which the code further down uses
# to recognise it. Every real university is ranked 1, 2, ... by descending n,
# with ties kept in their current row order. Previously the rank was derived
# from the row position (order() - 1), which silently assumed that "other" is
# always the largest row and mislabelled the most frequent university
# whenever it is not.
names(PhdUnis)[1] <- "University"
phd_is_other <- PhdUnis$University %in% "other"
PhdUnis$freq <- 0
PhdUnis$freq[!phd_is_other] <- rank(
  -PhdUnis$n[!phd_is_other],
  ties.method = "first"
)

# Ranks up to ten (and, provisionally, the rank-0 "other" row) are flagged as
# top ten; everything else occurs at least twice but is not in the top ten.
# Alternative that would keep every university tied with rank ten:
#PhdUnis$top[PhdUnis$n >= PhdUnis$n[PhdUnis$freq==10]] <- "1 - top ten"
PhdUnis$top <- ifelse(PhdUnis$freq <= 10, "1 - top ten", "98 - at least twice")

# Rank the undergraduate universities by how often they occur.
#
# The pooled "other" row (universities occurring only once) is identified by
# its label and receives the sentinel rank 0, which the code further down uses
# to recognise it. Every real university is ranked 1, 2, ... by descending n,
# with ties kept in their current row order. Previously the rank was derived
# from the row position (order() - 1), which silently assumed that "other" is
# always the largest row and mislabelled the most frequent university
# whenever it is not.
names(UGUnis)[1] <- "University"
ug_is_other <- UGUnis$University %in% "other"
UGUnis$freq <- 0
UGUnis$freq[!ug_is_other] <- rank(
  -UGUnis$n[!ug_is_other],
  ties.method = "first"
)

# Ranks up to ten (and, provisionally, the rank-0 "other" row) are flagged as
# top ten; everything else occurs at least twice but is not in the top ten.
# Alternative that would keep every university tied with rank ten:
#UGUnis$top[UGUnis$n >= UGUnis$n[UGUnis$freq==10]] <- "1 - top ten"
UGUnis$top <- ifelse(UGUnis$freq <= 10, "1 - top ten", "98 - at least twice")

# Stack the three stage tables into one long table.
educ <- rbind(PGUnis, PhdUnis, UGUnis)

# Rows with rank 0 become the "appears only once" bucket and are given a rank
# one past the largest rank in the table.
rank_zero <- educ$freq == 0
educ$top[rank_zero] <- "3 - appears only once"
educ$freq[rank_zero] <- max(educ$freq) + 1

# top2:  "<rank> - <university>" for top-ten rows, otherwise the bucket name.
# label: text drawn on the bar; empty for the two pooled buckets.
# col:   helper flag per bucket ("black" rows are aggregated further down).
educ <- educ %>%
  mutate(
    top2 = ifelse(
      top == "1 - top ten",
      paste(freq, University, sep = " - "),
      top
    ),
    label = ifelse(
      top2 %in% c("98 - at least twice", "3 - appears only once"),
      "",
      top2
    ),
    col = case_when(
      top2 == "98 - at least twice" ~ "black",
      top2 == "3 - appears only once" ~ "grey",
      TRUE ~ "white"
    )
  )


# Collapse all "at least twice" universities into one row per education
# stage, carrying the summed count and fixed bucket attributes.
educ2 <- educ %>%
  filter(top == "98 - at least twice") %>%
  group_by(level) %>%
  summarise(n = sum(n)) %>%
  mutate(
    University = "at least twice",
    top = "2 - at least twice",
    top2 = "2 - at least twice",
    label = "",
    col = "black",
    freq = NA
  )

# Plot data: the individual rows that were not collapsed, plus the collapsed
# per-stage rows.
neweduc <- educ %>%
  filter(col != "black") %>%
  rbind(educ2)

# Figure 2: one bar per education stage, filled to 100 % and split by
# frequency bucket, with the top-ten universities labelled inside the bar.
# The colour mapping is overridden by the constant colours in both layers but
# is kept because it still takes part in the grouping of the segments.
ggplot(neweduc, aes(x = level, y = n, fill = top, colour = label)) +
  geom_col(position = "fill", show.legend = TRUE, colour = "darkgrey") +
  geom_text(
    aes(label = label),
    size = 3,
    hjust = 0.5,
    vjust = 1.2,
    position = "fill",
    colour = "black"
  ) +
  scale_y_reverse() +
  scale_fill_brewer(
    labels = c('top ten', 'at least twice', 'appears only once'),
    palette = "Greys",
    name = "frequency in\ndatabase"
  ) +
  labs(
    title = "Distribution of educational institutions attended",
    x = "",
    y = ""
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")

